Update:

9/21

  1. Time series charts: add moving average lines to
  2. Heatmap of months and zones: calculate log, order by zones
#description(zone-days-start-end-files-hours of speech)
zone_list<-unique(Bpc_freq$zone)

des<-data.frame(zone=zone_list)
for (i in 1:length(zone_list)){
  des[i,"ndays"]<-nrow(unique(Bpc_freq[which(Bpc_freq$zone==zone_list[i]),'date']))
  des[i,"start"]<-head(unique(Bpc_freq[which(Bpc_freq$zone==zone_list[i]),'ts']),1)
  des[i,"end"]<-tail(unique(Bpc_freq[which(Bpc_freq$zone==zone_list[i]),'ts']),1)
  des[i,"nfiles"]<-nrow(Bpc_freq[which(Bpc_freq$zone==zone_list[i]),])
  des[i,"num_bpc"]<-sum(Bpc_freq[which(Bpc_freq$zone==zone_list[i]),"num_bpc"])
}
print(des)
##       zone ndays               start                 end nfiles num_bpc
## 1   Zone 1   360 2018-08-04 06:07:00 2019-08-05 15:26:00  16781 2659392
## 2  Zone 10   358 2018-08-02 23:47:00 2019-07-31 23:18:00  16905 4343509
## 3  Zone 11   357 2018-08-05 01:34:00 2019-07-31 23:05:00  16000 1567307
## 4  Zone 12   356 2018-08-07 05:17:00 2019-07-31 23:19:00  16305 4083850
## 5  Zone 13   357 2018-08-02 23:48:00 2019-07-31 23:18:00  16822 2169841
## 6   Zone 2   351 2018-08-01 01:22:00 2019-07-31 23:17:00  15996 2114226
## 7   Zone 3   183 2018-08-04 06:04:00 2019-02-02 10:22:00   8603 1130190
## 8   Zone 4   354 2018-08-04 06:36:00 2019-07-31 23:11:00  16305 4318849
## 9   Zone 5   186 2018-08-01 10:39:00 2019-02-02 17:30:00   8466 1618740
## 10  Zone 6   239 2018-10-30 09:13:00 2019-06-30 23:38:00  11312 2607037
## 11  Zone 8   356 2018-08-06 06:03:00 2019-07-31 23:04:00  16716 5000410

Visualization

Focus on zone 1

#plot

#ggplot(data = zone_1, aes(x=ts, y=num_bpc)) +  geom_col(position = 'dodge')+theme(axis.text.x = element_text(angle=90,hjust=1))

plot(xts(zone_1[,5],order.by=zone_1[,1]),main="zone 1")

decompose-trend-seasonal-random

library(TTR)
tss<-ts(zone_1[,5],freq=1800,start=1)
ts_components<-decompose(tss)
plot(ts_components)

ts_seasonally_adjusted<-tss-ts_components$seasonal
plot(ts_seasonally_adjusted)

stationary test

print(acf(data$num_bpc, lag.max = 20, main="auto-correlation test(20)"))

## 
## Autocorrelations of series 'data$num_bpc', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.610  0.499  0.440  0.380  0.326  0.279  0.230  0.175  0.125  0.078 
##     11     12     13     14     15     16     17     18     19     20 
##  0.033 -0.012 -0.037 -0.058 -0.081 -0.089 -0.084 -0.090 -0.091 -0.083
print(pacf(data$num_bpc, lag.max = 20, main="partial auto-correlation test"))

## 
## Partial autocorrelations of series 'data$num_bpc', by lag
## 
##      1      2      3      4      5      6      7      8      9     10     11 
##  0.610  0.202  0.122  0.052  0.022  0.007 -0.013 -0.036 -0.039 -0.041 -0.043 
##     12     13     14     15     16     17     18     19     20 
## -0.048 -0.021 -0.015 -0.020 -0.002  0.015 -0.003 -0.002  0.007
acf(data$num_bpc, lag.max = 200, main="auto-correlation test(200)")

non-stationary timeseries of seasonal-change

kpss unit root test

library(urca)
summary(ur.kpss(data$num_bpc))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 14 lags. 
## 
## Value of test-statistic is: 8.4413 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

value of test > critical value (1pt), refuse H0:stationary, =>non-stationary

diff

# use KPSS test to determine the appropriate number of differences
# nonstationary->stationary

library(forecast)
ndiffs(tss)
## [1] 1

first-order difference.

summary(ur.kpss(diff(tss)))
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 14 lags. 
## 
## Value of test-statistic is: 8e-04 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

differenced data are stationary.

tss %>% diff %>% ggtsdisplay(main="")

According to the ACF and PACF, we could set an initial candidate model ARIMA(1,0,1) or use autoarima(min AIC).

auto-arima

# auto-arima
ts_adj<-diff(tss)
fit<-auto.arima(ts_adj,seasonal = FALSE)
summary(fit)
## Series: ts_adj 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ma1
##       0.2041  -0.6929
## s.e.  0.0156   0.0122
## 
## sigma^2 estimated as 3949:  log likelihood=-93288.02
## AIC=186582   AICc=186582   BIC=186605.2
## 
## Training set error measures:
##                     ME    RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.0198953 62.8359 49.05602 NaN  Inf 0.6307193 -0.002759664

This is an ARIMA(1,0,1) model.

Yt = c + 0.2041Yt-1 -0.6929 epsilon t-1 + epsilon t

fit %>% forecast(h=10) %>% autoplot(include=80)

check residuals

checkresiduals(fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1) with zero mean
## Q* = 12500, df = 3354, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 3356

P-value, residuals are not white-noise, use ARCH model

res<-fit$residuals
res2<-res^2
par(mfrow=c(1,2))
acf(as.vector(res2),main="ACF of Squared Residuals")
pacf(as.vector(res2),main="PACF of Squared Residuals")

some spikes are out of the white noise bands, i.e there is a correlation between them, that is an indication of heteroscedasticity problem and usage of ARCH-GARCH model

Eagles ARCH test

library(MTS)
archTest(res)
## Q(m) of squared series(LM test):  
## Test statistic:  194.1281  p-value:  0 
## Rank-based Test:  
## Test statistic:  152.2412  p-value:  0

p value less than alpha, reject H0, exist ARCH effects

GARCH

library(rugarch)
spec = ugarchspec()
def.fit=ugarchfit(spec=spec, data=ts_adj)
print(def.fit)
## 
## *---------------------------------*
## *          GARCH Model Fit        *
## *---------------------------------*
## 
## Conditional Variance Dynamics    
## -----------------------------------
## GARCH Model  : sGARCH(1,1)
## Mean Model   : ARFIMA(1,0,1)
## Distribution : norm 
## 
## Optimal Parameters
## ------------------------------------
##         Estimate  Std. Error     t value Pr(>|t|)
## mu      0.041241    0.183852     0.22432 0.822509
## ar1     0.199464    0.015481    12.88444 0.000000
## ma1    -0.691977    0.012091   -57.22840 0.000000
## omega   7.897761    1.705693     4.63023 0.000004
## alpha1  0.005688    0.000447    12.73897 0.000000
## beta1   0.992300    0.000021 46307.37785 0.000000
## 
## Robust Standard Errors:
##         Estimate  Std. Error     t value Pr(>|t|)
## mu      0.041241    0.121949  3.3819e-01 0.735224
## ar1     0.199464    0.017682  1.1281e+01 0.000000
## ma1    -0.691977    0.013116 -5.2757e+01 0.000000
## omega   7.897761    2.241616  3.5232e+00 0.000426
## alpha1  0.005688    0.000581  9.7894e+00 0.000000
## beta1   0.992300    0.000002  5.8399e+05 0.000000
## 
## LogLikelihood : -93204.87 
## 
## Information Criteria
## ------------------------------------
##                    
## Akaike       11.110
## Bayes        11.113
## Shibata      11.110
## Hannan-Quinn 11.111
## 
## Weighted Ljung-Box Test on Standardized Residuals
## ------------------------------------
##                         statistic p-value
## Lag[1]                    0.02088 0.88511
## Lag[2*(p+q)+(p+q)-1][5]   4.26764 0.03258
## Lag[4*(p+q)+(p+q)-1][9]   9.15467 0.02070
## d.o.f=2
## H0 : No serial correlation
## 
## Weighted Ljung-Box Test on Standardized Squared Residuals
## ------------------------------------
##                         statistic p-value
## Lag[1]                      71.31       0
## Lag[2*(p+q)+(p+q)-1][5]     84.03       0
## Lag[4*(p+q)+(p+q)-1][9]     87.62       0
## d.o.f=2
## 
## Weighted ARCH LM Tests
## ------------------------------------
##             Statistic Shape Scale P-Value
## ARCH Lag[3]     4.834 0.500 2.000 0.02791
## ARCH Lag[5]     8.724 1.440 1.667 0.01343
## ARCH Lag[7]     9.353 2.315 1.543 0.02596
## 
## Nyblom stability test
## ------------------------------------
## Joint Statistic:  1.9688
## Individual Statistics:              
## mu     0.00441
## ar1    0.82383
## ma1    0.57602
## omega  0.43604
## alpha1 0.59209
## beta1  0.53307
## 
## Asymptotic Critical Values (10% 5% 1%)
## Joint Statistic:          1.49 1.68 2.12
## Individual Statistic:     0.35 0.47 0.75
## 
## Sign Bias Test
## ------------------------------------
##                      t-value      prob sig
## Sign Bias            1.08140 2.795e-01    
## Negative Sign Bias   0.02982 9.762e-01    
## Positive Sign Bias   9.19164 4.313e-20 ***
## Joint Effect       180.64764 6.392e-39 ***
## 
## 
## Adjusted Pearson Goodness-of-Fit Test:
## ------------------------------------
##   group statistic p-value(g-1)
## 1    20     345.2    9.888e-62
## 2    30     379.0    1.296e-62
## 3    40     414.2    3.136e-64
## 4    50     435.2    2.379e-63
## 
## 
## Elapsed time : 1.205869

1-estimated parameters: except mu parameter, all parameters are significant 

Yt = 0.00441 + 0.82383 R1 + 0.57602 A1
Omega^2 = 0.43604 + 0.59209(At-1)^2 + 0.53307(At-2)^2

2-Ljung-Box Test: residuals do not have autocorrelation 

3-ARCH LM Test: GARCH process is adequately fitted